
*Filename: 4b_analysis-analysis-sample.do
*Created: 20201023
*Last edited: 20220705

/*Description: 
	- Creates dataset to use for regressions
	- Imposes income restrictions (imputed, top/bottom coded, incl/excl zeros)	
	
	The start of this file allows for user-specified choices regarding income
	exclusions, including: whether (major) imputed incomes and hours are excluded;
	whether top/bottom coded incomes are excluded; whether farm and the labor part 
	of business income is added to the total labor income measure for 1994-2019
	surveys (to create a measure more comparable to 1968-1993 surveys);
	whether we exclude zero incomes where positive hours are reported;
	and whether we exclude "high" values (as defined by Lee & Solon).
	
	Next, user-specified choices define the construction of parent
	income measures (e.g., number of potential parent incomes, maximum parental 
	age to observe parent income, etc.). 
	
	Child income measures are created last.
	
	There are two datasets produced:
		- analysis-sample.dta contains all data (allows for additional sensitivity checks, figures involving parents, etc.)
		- analysis-sample-main.dta contains only the main regression analysis sample (and includes the ranked income measures)
	
*/

******************************************************* 
* Session setup: start from an empty dataset, disable paging, close any log
* left open by an earlier run, and raise the variable limit.
clear 
//version 16.0
set more off
capture log close 
set maxvar 32000

 
*Log file
* The path is wrapped in double quotes so that a projdata directory whose
* name contains spaces is still parsed as a single filename.
log using "${projdata}/analysis-sample.log", replace


*Open long version of dataset (path quoted for the same reason as the log)
use "${projdata}/analysis-sample-long.dta"


*Keep only SRC and SEO samples 
* (observations flagged as belonging to neither sample are dropped)
drop if src==0 & seo==0

 
 
**** INCOME EXCLUSIONS ****

* Each switch is held in a global (read by the -if- blocks further down) and
* is also written to the data as a constant variable, so the dataset records
* which choices produced it.

* Switch: exclude major imputed incomes
global ExcludeImp=1
generate excl_imputed = ${ExcludeImp}

* Switch: exclude top/bottom coded incomes
global ExcludeTB=1
generate excl_topbotcoded = ${ExcludeTB}

* Switch: add Farm/Bus to labor income in 1993-2018
global AddFarmBus=1 
generate added_farm_bus_inc = ${AddFarmBus}

* Zero-income handling. Set BOTH to 1 to exclude zero incomes with positive hours.
* Exclude0=1 with PosHrs=0 is recorded below as excluding all zeros.
global Exclude0=0
global PosHrs=0
generate excl_zeroinc_poshrs = ${Exclude0}*${PosHrs}
generate excl_allzeros = (${Exclude0}==1 & ${PosHrs}==0)

* FOR LATER ROBUSTNESS CHECKS: exclude high incomes as defined by Lee & Solon
global ExcludeHigh=0
generate excl_high_inc = ${ExcludeHigh}





**** OTHER RESTRICTIONS FOR IDENTIFYING PARENT INCOMES ****

* Every parameter is kept as a local macro (used by the code below) and is
* also stored in the data as a constant variable.

* How many POTENTIAL parent incomes are required
local poten_par_incomes=5
generate poten_par_incomes = `poten_par_incomes'

* Smallest number of OBSERVED parent incomes that is accepted
local min_par_incomes=2
generate min_par_incomes = `min_par_incomes'

* Child age at which observation of parent incomes begins
local par_inc_start_ch_age=12
generate par_inc_start_ch_age = `par_inc_start_ch_age'

* Oldest parent age allowed at the last potential income
local max_par_age=65
generate max_par_age = `max_par_age'

* Cutoff below which an income counts as "low" for the "c" income measures
local lowest=100*(240.007/33.4)
generate low_inc_cutoff = `lowest'

* How many of the highest incomes enter the MAX parent income measure
local num_highest_incomes=2
generate num_highest_incomes = `num_highest_incomes'





			 

*Create Parent age based on their birth year ("cohort", created for all individuals in individual-clean.do)
* Age is only filled in for years after the parent's birth year.
foreach p in m f {
	gen `p'_calcage = year - `p'_cohort if year > `p'_cohort
}


/*
	Create hours variables 
	
	Hours worked (and income) are only reported for the Head/Wife, so each hours
	variable below holds the person's own hours: the head's hours when the
	person is coded as head, the wife's hours when coded as wife, and missing
	for any other relationship code.
	
	"hd" = "Head", identified by the relationship to head variable ("rel") codes 1, 10
	"wf" = "Wife", identified by the relationship to head variable codes 2, 20, 22
*/

* Child's own hours
gen hours = cond(inlist(rel,1,10), hourshd, cond(inlist(rel,2,20,22), hourswf, .))

* Mother's and father's own hours
foreach p in m f {
	gen `p'_hours = cond(inlist(`p'_rel,1,10), `p'_hourshd, cond(inlist(`p'_rel,2,20,22), `p'_hourswf, .))
}



/*
	Impose the income restrictions / exclusions chosen by the user above.
	
	An "exclusion" is imposed by setting the income to missing whenever
	it fails the respective criterion.
	
	The exclusions are applied separately to child incomes (no prefix),
	mother incomes (m_), and father incomes (f_).
*/

if ${ExcludeImp}==1 {

	/* IMPUTED VALUES: an income is kept only when its accuracy flag (S_*) equals 1 */

	* ---- child variables ----
	* labor incomes imputed by MAJOR assignment
	replace rinchd = . if S_inchd != 1
	replace rincwf = . if S_incwf != 1
	* family income: dropped when nonlabor income OR either labor income was imputed
	replace rfminc = . if S_nonlabinc != 1 | S_inchd != 1 | S_incwf != 1
	* business incomes: head and wife have separate incomes but share one accuracy variable
	replace rearnbushd = . if S_businc != 1
	replace rearnbuswf = . if S_businc != 1
	* farm income
	replace rfarminc = . if S_frminc != 1

	* ---- the same rules for father (f_) and mother (m_) variables ----
	foreach par in f m {
		replace `par'_rinchd = . if `par'_S_inchd != 1
		replace `par'_rincwf = . if `par'_S_incwf != 1
		replace `par'_rfminc = . if `par'_S_nonlabinc != 1 | `par'_S_inchd != 1 | `par'_S_incwf != 1
		replace `par'_rearnbushd = . if `par'_S_businc != 1
		replace `par'_rearnbuswf = . if `par'_S_businc != 1
		replace `par'_rfarminc = . if `par'_S_frminc != 1
	}

}



if ${ExcludeTB}==1 {

	/* TOP- or BOTTOM-CODED values: set the income to missing when its flag equals 1 */

	* ---- child variables ----
	* labor incomes (only a top-code flag is used here)
	replace rinchd = . if F_inchd_top == 1
	replace rincwf = . if F_incwf_top == 1
	* family income: top- or bottom-coded
	replace rfminc = . if F_fminc_top == 1 | F_fminc_bot == 1
	* farm income: top- or bottom-coded
	replace rfarminc = . if F_farminc_top == 1 | F_farminc_bot == 1

	* ---- the same rules for father (f_) and mother (m_) variables ----
	foreach par in f m {
		replace `par'_rinchd = . if `par'_F_inchd_top == 1
		replace `par'_rincwf = . if `par'_F_incwf_top == 1
		replace `par'_rfminc = . if `par'_F_fminc_top == 1 | `par'_F_fminc_bot == 1
		replace `par'_rfarminc = . if `par'_F_farminc_top == 1 | `par'_F_farminc_bot == 1
	}

}



if ${AddFarmBus}==1 {

/* Add FARM INCOME and BUSINESS INCOME to labor income with this section */

 /*
 
 We Add FARM income and labor part of BUSINESS income to the total labor income variable
 to obtain a labor income that is (more) comparable across years. Total labor income included
 the labor part of farm and labor part of business income for 1968-1993 surveys (1967-92 incomes)
 but then excluded these for all surveys since then (1993-2018 income years; 1994-2019 survey years).
 The labor part of farm income is not available though, so we have to add total farm income. This does
 result in a small number of negative (or zero) labor incomes (see below for how these are handled).
 
 */
 
 
 ***Child labor incomes
 
 *First get sum of total labor income, labor part of business income, and farm income
 * (egen rowtotal treats a missing component as zero, so the sum itself is never missing)
 egen temphd=rowtotal(rinchd rearnbushd rfarminc) 	if  (year>=1993 & year<=2018)	   		
 egen tempwf=rowtotal(rincwf rearnbuswf rfarminc)   if  (year>=1993 & year<=2018)	
 
   *Use this sum of total labor income only if EACH of these incomes satisfies the exclusion restrictions above 
   *(the only way an income is missing is from these exclusions imposed above; all values are assigned in the PSID so zeros will be reported for, say, no farm income)
   *All three components must be non-missing (&, not |). With | an income that was set to
   *missing above (e.g. top-coded) would be overwritten by the sum of the remaining
   *components, which undoes the exclusion.
   *NOTE(review): when the labor income is valid but a business/farm component was excluded,
   *the labor income is left as is (narrower definition) - confirm this is the desired handling.
   replace rinchd= temphd   if  (year>=1993 & year<=2018) & (rinchd!=. & rearnbushd!=. & rfarminc!=.)
   replace rincwf= tempwf   if  (year>=1993 & year<=2018) & (rincwf!=. & rearnbuswf!=. & rfarminc!=.)	 
   drop temphd tempwf
   
   *Exclude incomes that are negative due to farm losses
   replace rinchd=. if rinchd<0
   replace rincwf=. if rincwf<0

*** Parent labor incomes

 *Now do the same for mother (m) and father (f) incomes
 foreach p in m f {
  egen `p'_temphd=rowtotal(`p'_rinchd `p'_rearnbushd `p'_rfarminc) 	if  (year>=1993 & year<=2018) 	   		
  egen `p'_tempwf=rowtotal(`p'_rincwf `p'_rearnbuswf `p'_rfarminc)  if  (year>=1993 & year<=2018)
   * same rule as for the child: all three components must be non-missing
   replace `p'_rinchd= `p'_temphd   if  (year>=1993 & year<=2018) & (`p'_rinchd!=. & `p'_rearnbushd!=. & `p'_rfarminc!=.)
   replace `p'_rincwf= `p'_tempwf   if  (year>=1993 & year<=2018) & (`p'_rincwf!=. & `p'_rearnbuswf!=. & `p'_rfarminc!=.)	 
   drop `p'_temphd `p'_tempwf	
   
   *Exclude incomes that are negative due to farm losses
   replace `p'_rinchd=. if `p'_rinchd<0
   replace `p'_rincwf=. if `p'_rincwf<0
   
 }


   
	*** If excluding imputed values, also need to impose these exclusions here on aggregate measures created for 1994-2018. 
	* The aggregate is set to missing when ANY of its three components was imputed.
	if ${ExcludeImp}==1 {
	replace rinchd=.   if (year>=1993 & year<=2018) &  (S_inchd!=1   | S_businc!=1   | S_frminc!=1)
	replace rincwf=.   if (year>=1993 & year<=2018) &  (S_incwf!=1   | S_businc!=1   | S_frminc!=1)
	replace f_rinchd=. if (year>=1993 & year<=2018) &  (f_S_inchd!=1 | f_S_businc!=1 | f_S_frminc!=1) 
	replace f_rincwf=. if (year>=1993 & year<=2018) &  (f_S_incwf!=1 | f_S_businc!=1 | f_S_frminc!=1) 
	replace m_rinchd=. if (year>=1993 & year<=2018) &  (m_S_inchd!=1 | m_S_businc!=1 | m_S_frminc!=1) 
	replace m_rincwf=. if (year>=1993 & year<=2018) &  (m_S_incwf!=1 | m_S_businc!=1 | m_S_frminc!=1) 
	}
 
}
 


if ${Exclude0}==1 {

	/* NEGATIVE or ZERO values are set to missing in this section */

	/*
		When the user also asked for the positive-hours restriction, the three
		locals below carry an extra clause that is appended to the conditions
		further down, so only zero incomes reported together with positive hours
		are dropped. Otherwise the locals stay undefined and expand to nothing.
	*/
	if ${PosHrs}==1 {
		local PH 	"& (hours>0   & hours<.)"
		local m_PH 	"& (m_hours>0 & m_hours<.)"
		local f_PH 	"& (f_hours>0 & f_hours<.)"
	}

	* Head labor, wife labor and family income; for child, mother and father
	//foreach var in rinchd rincwf rfminc rearnbushd rfarminc {
	foreach inc in rinchd rincwf rfminc {
		replace `inc' = .   if `inc' <= 0 `PH'
		replace m_`inc' = . if m_`inc' <= 0 `m_PH'
		replace f_`inc' = . if f_`inc' <= 0 `f_PH'
	}

}

 
 
 

if ${ExcludeHigh}==1 {

	/* LS HIGH VALUE outliers: incomes above the ceiling are set to missing */

	local highest=150000*(240.007/33.4)

	* child, mother and father versions of head labor, wife labor and family income
	foreach v in rinchd m_rinchd f_rinchd rincwf m_rincwf f_rincwf rfminc m_rfminc f_rfminc {
		replace `v' = . if `v' > `highest'
	}
} 






*Require child/parent to be in household at time of interview to use income

 /* In a small number of cases there may be two heads (or wives) of the household
    for a given survey year. This happens when a Head/Wife was present at the
	previous interview (year) but has since died, been institutionalized, or
	become nonresponse. The income therefore has to be assigned to the Head or
	Wife who IS present at the interview where the income is reported.
	Sequence numbers 1-20 indicate presence at the time of the interview, so for
	anyone outside that range the head and wife LABOR incomes are set to missing.
	
	Note: This imposes no restriction on family incomes.
 */

 foreach inc in rinchd rincwf {
	replace `inc' = .   if !inrange(seq,1,20)
	replace m_`inc' = . if !inrange(m_seq,1,20)
	replace f_`inc' = . if !inrange(f_seq,1,20)
 }





********************** PARENT INCOMES AND ASSOCIATED AGES  ***********************




*** Child's AGE at which measurement of parental incomes begins (user choice set above)
local amin=`par_inc_start_ch_age'


*** Find the wave number at which the window of potential parent incomes opens
/* 
The dataset was built as a "balanced" panel: each individual has exactly 41 observations,
i.e. 1 observation per wave. Within-individual _n logic can therefore be used to step through waves.

	1st: Where possible, start at the wave in which the child's age equals amin (age 12).
	     That wave plus the next four survey waves hold the five potential incomes.
    
	2nd: Earlier cohorts are not in the PSID at these ages, so they start in wave 1, at slightly older ages:
			1952 cohort: use age 15-19
			1953 cohort: use age 14-18
			1954 cohort: use age 13-17
			1955 cohort: use age 12-16
	
	3rd: If the parent is >65 at the 5th potential income, the 5 potential incomes are moved to earlier
	     ages so that the last one is at age 65. This is parent-specific, so it is done separately
	     for mothers and fathers. In biennial years where age 65 is not observed, age 64 is the max age.
*/

sort newid year

* wave # in the survey year where the child is exactly amin years old (if such a year exists)
generate temp`amin' = wave if year-cohort==`amin'
* biennial years: start one year later when neither this row nor the previous row of the same person is set
replace temp`amin' = wave if year-cohort-1==`amin' & newid==newid[_n-1] & temp`amin'==. & temp`amin'[_n-1]==.
* older cohorts: parent income cannot be observed before the 1st wave (as laid out above)
replace temp`amin' = wave if wave==1 & inrange(cohort,1952,1955)
* spread the starting wave # to all rows of the person; stays missing when no row was set
bysort newid: egen wave`amin' = total(temp`amin'), missing

*Now number the (poten_par_incomes=5) potential incomes; rows outside the window are missing
generate par_inc_n = 1 + wave - wave`amin'
replace par_inc_n = . if par_inc_n<=0 | par_inc_n>`poten_par_incomes'



*Need to use earlier incomes for OLD mothers/fathers, defined as parents older than max_par_age (=65)
*at the last potential income measure.
*The number of potential incomes is taken from the local poten_par_incomes throughout
*(previously the value 5 was hard-coded here); with poten_par_incomes=5 the results are unchanged.

*** Create separate variables to index mother/father potential incomes separately to accommodate shift for old mothers/fathers
gen f_inc_n=par_inc_n
gen m_inc_n=par_inc_n
label var f_inc_n "Father's income # out of `poten_par_incomes' potential incomes"
label var m_inc_n "Mother's income # out of `poten_par_incomes' potential incomes"

* number of income slots that precede the last one
local n_before_last=`poten_par_incomes'-1

* Same steps for FATHERS (f) and then MOTHERS (m)
foreach p in f m {

	*First identify the valid last potential income
	* flag the last potential income obs if the parent is older than max_par_age at that time
	gen temp_`p'_`max_par_age'=1 if `p'_inc_n==`poten_par_incomes' & `p'_calcage>`max_par_age' & `p'_calcage<.
	* expand the flag to cover all obs of this parent-child pair
	bysort newid `p'_newid: egen `p'_over`max_par_age'=sum(temp_`p'_`max_par_age')
	* wipe the index for "old" parents, then rebuild it backwards from max_par_age
	replace `p'_inc_n=. if `p'_over`max_par_age'==1
	* last potential income is the one at age max_par_age
	replace `p'_inc_n=`poten_par_incomes' if `p'_over`max_par_age'==1 & `p'_calcage==`max_par_age'
	* biennial years: use age max_par_age-1 for the last income if max_par_age is not observed
	replace `p'_inc_n=`poten_par_incomes' if `p'_over`max_par_age'==1 & `p'_calcage==`max_par_age'-1 & `p'_inc_n==.  & year>=1996

	*Now get the earlier potential incomes using the fact that each obs is a survey wave:
	*the row k waves before the age-max_par_age row gets index poten_par_incomes-k
	sort newid year
	forvalues k=1/`n_before_last' {
		local idx=`poten_par_incomes'-`k'
		replace `p'_inc_n=`idx' if `p'_over`max_par_age'==1 & `p'_calcage[_n+`k']==`max_par_age' & newid==newid[_n+`k'] & `p'_newid==`p'_newid[_n+`k']
	}

	*Same for biennial years where the parent is never observed at max_par_age
	*(anchor on age max_par_age-1; only rows that are still unassigned are filled)
	sort newid year
	forvalues k=1/`n_before_last' {
		local idx=`poten_par_incomes'-`k'
		replace `p'_inc_n=`idx' if `p'_over`max_par_age'==1 & `p'_calcage[_n+`k']==`max_par_age'-1 & newid==newid[_n+`k'] & `p'_newid==`p'_newid[_n+`k'] & `p'_inc_n==.  & year[_n+`k']>=1996
	}

}


*Only use child-parent pairs with potential to observe all poten_par_incomes (=5) incomes 
bysort newid f_newid: egen f_num_poten_inc=count(f_inc_n)
bysort newid m_newid: egen m_num_poten_inc=count(m_inc_n)
  tab f_newid if f_num_poten_inc>0 & f_num_poten_inc<`poten_par_incomes' & inrange(cohort,1952,1993) // N=5 "old" fathers who are already age>61 at wave 1 
  tab m_newid if m_num_poten_inc>0 & m_num_poten_inc<`poten_par_incomes' & inrange(cohort,1952,1993) // N=2 "old" mothers who are already age>61 at wave 1 
replace f_inc_n=. if f_num_poten_inc<`poten_par_incomes' 
replace m_inc_n=. if m_num_poten_inc<`poten_par_incomes' 




**** PARENT Annual LABOR incomes and associated ages

* Holding variables (F = father, M = mother), all starting out missing:
*   LAB    labor income                    LABYR   year of the observed labor income
*   LABc   labor income excl. low values   LABYRc  year of the observed income (excl. low values)
foreach P in F M {
	foreach stub in LAB LABYR LABc LABYRc {
		gen temp`P'`stub'=.
	}
}

* Fill the holding variables, father first and then mother
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	local in_window "inrange(`p'_inc_n,1,`poten_par_incomes')"

	* income within the potential-income window: head income in years the parent is
	* HEAD/REFERENCE PERSON, wife income in years the parent is WIFE/SPOUSE/PARTNER
	* (for fathers this allows for being the "Spouse/Partner" in a same sex couple)
	replace temp`P'LAB = `p'_rinchd if `in_window' & inlist(`p'_rel,1,10)
	replace temp`P'LAB = `p'_rincwf if `in_window' & inlist(`p'_rel,2,20,22)
	* YEARS of the OBSERVED incomes among the potential incomes
	replace temp`P'LABYR = year if `in_window' & temp`P'LAB!=.

	* "c" versions: keep only incomes that are not "too low", and their years
	replace temp`P'LABc   = temp`P'LAB   if temp`P'LAB>=`lowest' & temp`P'LAB!=.
	replace temp`P'LABYRc = temp`P'LABYR if temp`P'LAB>=`lowest' & temp`P'LAB!=.
}


*Count observed incomes per child-parent pair, to restrict to pairs with at least 2 parent incomes observed 
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	bysort newid `p'_newid: egen `p'_LAB_N  = count(temp`P'LAB)
	bysort newid `p'_newid: egen `p'_LABc_N = count(temp`P'LABc)
}

label var f_LAB_N "num of father's observed earnings"
label var f_LABc_N "num of father's earnings (excl low inc)"
label var m_LAB_N "num of mother's observed earnings"
label var m_LABc_N "num of mother's earnings (excl low inc)"

*Replace with missing if fewer than the required minimum were observed
* (min_par_incomes here is the constant variable created above)
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	replace temp`P'LAB    = . if `p'_LAB_N<min_par_incomes
	replace temp`P'LABYR  = . if `p'_LAB_N<min_par_incomes
	replace temp`P'LABc   = . if `p'_LABc_N<min_par_incomes
	replace temp`P'LABYRc = . if `p'_LABc_N<min_par_incomes
}
 
 
*Record each year of parent observed income 
* For every slot n, pick the year on the row that holds slot n and copy it to
* all rows of the child-parent pair (mean over a single non-missing value).
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	local whose = cond("`p'"=="f", "father's", "mother's")

	forvalues n=1/`poten_par_incomes' {
		gen `p'_tempYR_`n'  = temp`P'LABYR  if `p'_inc_n==`n' & temp`P'LAB!=.
		gen `p'_tempYRc_`n' = temp`P'LABYRc if `p'_inc_n==`n' & temp`P'LABc!=.
		bysort newid `p'_newid: egen `p'_LABYR_`n'  = mean(`p'_tempYR_`n')
		bysort newid `p'_newid: egen `p'_LABYRc_`n' = mean(`p'_tempYRc_`n')
		label var `p'_LABYR_`n' "Year of `whose' inc num `n'"
		label var `p'_LABYRc_`n' "Year of `whose' inc num `n' (excl low)"
	}
	drop `p'_tempYR_? `p'_tempYRc_?
}

*Record each potential income (for IV approach)
* Same pattern: the n-th observed income is copied to all rows of the pair.
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")

	forvalues n=1/`poten_par_incomes' {
		gen `p'_temp_`n' = temp`P'LAB if `p'_inc_n==`n' & temp`P'LAB!=.
		bysort newid `p'_newid: egen `p'_LAB_`n' = mean(`p'_temp_`n')
		label var `p'_LAB_`n' "Income # `n' (of five potential incomes)"
	}
	drop `p'_temp_?
}


*** PARENT AVERAGE Income Measures ("c" for measures excluding low incomes)
* Within each child (newid), average the annual holding variables: income,
* income excl. low values, and the corresponding years.
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	foreach stat in LAB LABc LABYR LABYRc {
		bysort newid: egen `p'_`stat' = mean(temp`P'`stat')
	}
}

label var f_LAB "father's earnings"
label var f_LABc "father's earnings (excl low inc)"
label var f_LABYR "avg year of father's earnings"
label var f_LABYRc "avg year of father's earnings (excl low inc)"

label var m_LAB "mother's earnings"
label var m_LABYR "avg year of mother's earnings"
label var m_LABc "mother's earnings (excl low inc)"
label var m_LABYRc "avg year of mother's earnings (excl low inc)"


*** Log of parent average incomes (excluding "low" incomes; "c" measure)
foreach p in f m {
	generate `p'_LOGLABc = ln(`p'_LABc)
}
label var f_LOGLABc "father's log earnings (excl low inc)" 
label var m_LOGLABc "mother's log earnings (excl low inc)"
 

 

********* Combined Parent Income Measures

* Row-wise mean of the mother's and father's average income measures
egen p_MFAVG  = rowmean(m_LAB f_LAB)
egen p_MFAVGc = rowmean(m_LABc f_LABc)

* Row-wise maximum of the mother's and father's average income measures
egen p_MFMAX  = rowmax(m_LAB f_LAB)
egen p_MFMAXc = rowmax(m_LABc f_LABc)

* Natural logs of the "c" (excl. low income) versions
generate p_LOGMFAVGc = ln(p_MFAVGc)
generate p_LOGMFMAXc = ln(p_MFMAXc)

label var p_MFAVG "average parental earnings"
label var p_MFAVGc "average parental earnings (excl low inc)"
label var p_LOGMFAVGc "log of average parental earnings (excl low inc)"
label var p_MFMAX "max parental earnings"
label var p_MFMAXc "max parental earnings (excl low inc)"
label var p_LOGMFMAXc "log of max parental earnings (excl low inc)"

 
********* MAXIMUM Parent Income Measures
*Separately for each parent: take the highest incomes (num_highest_incomes=2) observed
*among the potential incomes, then average them to get that parent's MAXLAB.

* FATHERS (f) first, then MOTHERS (m)
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	local Whose = cond("`p'"=="f", "Father's", "Mother's")

	*Sort potential incomes from highest to lowest within the pair and assign rank (1=highest);
	*ranks are only given to pairs that meet the minimum number of observed incomes
	gsort newid `p'_newid -temp`P'LAB
	by newid `p'_newid: gen `p'_tempRANK=_n if temp`P'LAB!=. & `p'_newid!=. & `p'_LAB_N>=min_par_incomes & `p'_LAB_N!=.

	*Annual incomes of the top-ranked rows, and the parent's age on those rows
	gen temp`P'MAXLAB=temp`P'LAB if inrange(`p'_tempRANK,1,`num_highest_incomes')
	gen temp`P'MAXLABAGE=`p'_calcage if inrange(`p'_tempRANK,1,`num_highest_incomes') & temp`P'MAXLAB==temp`P'LAB & temp`P'MAXLAB!=.

	*Averages per child: highest observed incomes, and parent age at those incomes
	bysort newid: egen `p'_MAXLAB=mean(temp`P'MAXLAB)
	bysort newid: egen `p'_MAXLABAGE=mean(temp`P'MAXLABAGE)

	label var `p'_MAXLAB "`Whose' max income"
	label var `p'_MAXLABAGE "`Whose' age at max income"

	***Same steps for the "c" measure (excluding zeros and low incomes)
	gsort newid `p'_newid -temp`P'LABc
	by newid `p'_newid: gen `p'_tempRANKc=_n if temp`P'LABc!=. & `p'_newid!=. & `p'_LABc_N>=min_par_incomes & `p'_LABc_N!=.

	gen temp`P'MAXLABc=temp`P'LABc if inrange(`p'_tempRANKc,1,`num_highest_incomes') & temp`P'LABc>=`lowest' & temp`P'LABc!=.
	gen temp`P'MAXLABAGEc=`p'_calcage if inrange(`p'_tempRANKc,1,`num_highest_incomes') & temp`P'MAXLABc==temp`P'LABc & temp`P'MAXLABc!=.

	*Averages (egen mean gives missing when all arguments are missing)
	bysort newid: egen `p'_MAXLABc=mean(temp`P'MAXLABc)
	bysort newid: egen `p'_MAXLABAGEc=mean(temp`P'MAXLABAGEc)

	label var `p'_MAXLABc "`Whose' max income (excl low)"
	label var `p'_MAXLABAGEc "`Whose' age at max income (excl low)"
}


*** Log of parent incomes	   
foreach p in f m {
	generate `p'_LOGMAXLABc = ln(`p'_MAXLABc)
}

label var f_LOGMAXLABc "Father's log of max incomes" 
label var m_LOGMAXLABc "Mother's log of max incomes" 
 
 
 
 
 
***** "FAMILY STRUCTURE" DURING PARENT INCOME YEARS

* Parent years head/wife during potential income years
* A marker is set to 1 on rows where the parent is present at the interview
* (seq 1-20), holds the given relationship code, and the row is one of the
* potential income years; it is missing otherwise.
foreach role in HD WF {
	local codes = cond("`role'"=="HD", "1,10", "2,20,22")
	foreach p in m f {
		gen `p'_temp`role' = 1 if inrange(`p'_seq,1,20) & inlist(`p'_rel,`codes') & inrange(`p'_inc_n,1,`poten_par_incomes')
	}
}

* Number of marked years per child-parent pair
foreach role in hd wf {
	local ROLE = cond("`role'"=="hd", "HD", "WF")
	foreach p in f m {
		bysort newid `p'_newid: egen `p'_nyrs_`role' = sum(`p'_temp`ROLE')
	}
}

label var f_nyrs_hd "# potential income years father HEAD (rel=1,10; seq=1-20)"
label var m_nyrs_hd "# potential income years mother HEAD (rel=1,10; seq=1-20)"
label var f_nyrs_wf "# potential income years father WIFE (rel=2,20,22; seq=1-20)"
label var m_nyrs_wf "# potential income years mother WIFE (rel=2,20,22; seq=1-20)"

* Remove every helper variable whose name starts with m_temp or f_temp
drop m_temp* f_temp* 


 
*****  EMPLOYMENT STATUS

* By year, get Median income for men in SRC sample that have positive income and are age 30-55 
* The work is done on a temporary copy of the data (preserve/restore); the
* yearly medians are saved to a tempfile and merged back onto the full data.
preserve
  keep if src==1  // SRC
  keep if female==0 // men 
  keep if inrange(age,30,55) // age 30-55
  
  * own labor income: head income if coded head, wife income if coded wife,
  * and only when present at the interview (seq 1-20)
  gen income=.
  replace income=rinchd if inlist(rel,1,10) & inrange(seq,1,20)
  replace income=rincwf if inlist(rel,2,20,22) & inrange(seq,1,20)
  * Keep strictly positive, non-missing incomes only. Previously only missing
  * incomes were dropped, so zero incomes entered the median whenever zeros were
  * not excluded above, contradicting the "rinc>0" definition in the label below.
  keep if income>0 & income<.
  
  keep income year
  collapse (median) med_rinc=income (count) Nobs_med_rinc=income, by(year)
  label var med_rinc "Median of annual real income, SRC men age 30-55 with rinc>0"
  label var Nobs_med_rinc "# obs underlying Median of annual real income"

 tempfile medincome
 save `medincome'
restore

* Merge in median incomes (one row per year in the using file)
 merge m:1 year using `medincome', nogen


* Create employment dummies
* employ = 1 when the person's own labor income is at least 20% of that year's
* median, 0 when it is below; it is only defined for heads/wives who are
* present at the interview (seq 1-20) and have a non-missing income.
gen employ=.
replace employ = (rinchd >= (.2*med_rinc)) if rinchd!=. & inlist(rel,1,10) & inrange(seq,1,20)
replace employ = (rincwf >= (.2*med_rinc)) if rincwf!=. & inlist(rel,2,20,22) & inrange(seq,1,20)
lab var employ "Employed =1 if real income>= .2*median"

*Create employment dummies for each of the potential income years 
* For slot i the row-level dummy uses the same 20%-of-median rule as above,
* restricted to the row holding potential income i; the pair-level mean then
* copies that value to every row of the child-parent pair.
foreach p in f m {
	local P = cond("`p'"=="f", "F", "M")
	local Who = cond("`p'"=="f", "Father", "Mother")

	forvalues i=1/`poten_par_incomes' {
		gen temp`P'EMPL`i'=.
		replace temp`P'EMPL`i' = (`p'_rinchd >= (.2*med_rinc)) if `p'_rinchd!=. & inlist(`p'_rel,1,10) & inrange(`p'_seq,1,20) & `p'_inc_n==`i'
		replace temp`P'EMPL`i' = (`p'_rincwf >= (.2*med_rinc)) if `p'_rincwf!=. & inlist(`p'_rel,2,20,22) & inrange(`p'_seq,1,20) & `p'_inc_n==`i'
		bysort newid `p'_newid: egen `p'_empl`i'=mean(temp`P'EMPL`i')
		lab var `p'_empl`i' "`Who' Employed =1 if real income (#`i')>= .2*median"
	}
}

* Parent EMPLOYMENT measures (averaged over observed parental incomes during five "potential" years) 
foreach p in f m {
	egen `p'_emplavg=rowmean(`p'_empl?)
}
label var f_emplavg "Father: average employment status over observed potential incomes"
label var m_emplavg "Mother: average employment status over observed potential incomes"


* Now just get employment at first nonmissing potential income measure
foreach p in f m {
	egen `p'_empl1st=rowfirst(`p'_empl?)
}
label var f_empl1st "Father: 1st nonmissing employment over observed potential incomes"
label var m_empl1st "Mother: 1st nonmissing employment  over observed potential incomes"




 

*** Create employment measure based on PSID hours report (corresponds to income year)

* Based on reported hours only (imputed hours do not appear to be excluded here -- TODO confirm).
* The parent versions built further down additionally require a valid income in that year.
* A 15*52 = 780 hour minimum was the earlier choice; there may be a more established
* threshold in the literature.
//local MinHours=780 
local MinHours=480 // arbitrary: 480 hours is about 12 full-time (40h) weeks, or just over 9 hours/week all year

* Individual-level indicator: 1 if hours reach the minimum, 0 if below, and missing
* when hours are missing or the row is outside seq 1-20 / rel 1,2,10,20,22
gen employh = (hours >= `MinHours') if hours < . & inrange(seq,1,20) & inlist(rel,1,2,10,20,22)


* Parent hours-based employment, one indicator per potential income slot.
* A slot is coded only when hours are nonmissing and the parent has a nonmissing
* income for that row (tempFLAB / tempMLAB): 1 if hours >= MinHours, 0 if below.
* The variable labels say ">=" so that they match the inclusive threshold used.
forv i=1/`poten_par_incomes' {
	gen tempFH`i'=.
	replace tempFH`i'=(f_hours>=`MinHours') if f_hours<. & inrange(f_seq,1,20) & inlist(f_rel,1,2,10,20,22) & tempFLAB<. & f_inc_n==`i'
	* Copy the slot value onto every row of the child-father pair
	bysort newid f_newid: egen f_emplh`i'=mean(tempFH`i')
	lab var f_emplh`i' "Father Employed if hours>=`MinHours' (potential income `i')"
}
forv i=1/`poten_par_incomes' {
	gen tempMH`i'=.
	replace tempMH`i'=(m_hours>=`MinHours') if m_hours<. & inrange(m_seq,1,20) & inlist(m_rel,1,2,10,20,22) & tempMLAB<. & m_inc_n==`i'
	* Copy the slot value onto every row of the child-mother pair
	bysort newid m_newid: egen m_emplh`i'=mean(tempMH`i')
	lab var m_emplh`i' "Mother Employed if hours>=`MinHours' (potential income `i')"
}

*Avg measure
* The per-slot variables are listed explicitly rather than with the f_emplh? / m_emplh?
* wildcards: "?" matches exactly one character, so slots 10 and above would be
* silently left out of the row statistics.
local f_emplhvars
local m_emplhvars
forvalues k = 1/`poten_par_incomes' {
	local f_emplhvars `f_emplhvars' f_emplh`k'
	local m_emplhvars `m_emplhvars' m_emplh`k'
}
egen f_emplhavg=rowmean(`f_emplhvars')
egen m_emplhavg=rowmean(`m_emplhvars')
label var f_emplhavg "Father: avg empl status over observed incomes with reported hours (>=`MinHours' hours)"
label var m_emplhavg "Mother: avg empl status over observed incomes with reported hours (>=`MinHours' hours)"

*First nonmissing 
* (rowfirst scans the list left to right, i.e. slot 1 first)
egen f_emplh1st=rowfirst(`f_emplhvars')
egen m_emplh1st=rowfirst(`m_emplhvars')
label var f_emplh1st "Father: 1st nonmissing employment (hrs) over observed potential incomes"
label var m_emplh1st "Mother: 1st nonmissing employment (hrs) over observed potential incomes"



*** Now just use ZERO hours as threshold
local MinHours0=0

* Individual-level indicator: 0 when hours equal the threshold, 1 when strictly above it.
* Hours below the threshold or missing, and rows outside seq 1-20 / rel 1,2,10,20,22,
* are left missing.
gen employ0h = (hours > `MinHours0') if hours >= `MinHours0' & hours < . & inrange(seq,1,20) & inlist(rel,1,2,10,20,22)


* Parent any-hours employment, one indicator per potential income slot.
* Coded 0 when hours equal MinHours0 and 1 when strictly above it, and only on rows
* with a nonmissing parent income (tempFLAB / tempMLAB) that belong to the current slot.
forvalues k = 1/`poten_par_incomes' {
	gen tempF0H`k' = .
	replace tempF0H`k' = (f_hours > `MinHours0') if f_hours >= `MinHours0' & f_hours < . & inrange(f_seq,1,20) & inlist(f_rel,1,2,10,20,22) & tempFLAB < . & f_inc_n == `k'
	* Copy the slot value onto every row of the child-father pair
	by newid f_newid, sort: egen f_empl0h`k' = mean(tempF0H`k')
	label variable f_empl0h`k' "Father Employed if hours>`MinHours0' (potential income `k')"
}
forvalues k = 1/`poten_par_incomes' {
	gen tempM0H`k' = .
	replace tempM0H`k' = (m_hours > `MinHours0') if m_hours >= `MinHours0' & m_hours < . & inrange(m_seq,1,20) & inlist(m_rel,1,2,10,20,22) & tempMLAB < . & m_inc_n == `k'
	* Copy the slot value onto every row of the child-mother pair
	by newid m_newid, sort: egen m_empl0h`k' = mean(tempM0H`k')
	label variable m_empl0h`k' "Mother Employed if hours>`MinHours0' (potential income `k')"
}

*Avg measure
* The per-slot variables are listed explicitly rather than with the f_empl0h? / m_empl0h?
* wildcards: "?" matches exactly one character, so slots 10 and above would be
* silently left out of the average.
local f_empl0hvars
local m_empl0hvars
forvalues k = 1/`poten_par_incomes' {
	local f_empl0hvars `f_empl0hvars' f_empl0h`k'
	local m_empl0hvars `m_empl0hvars' m_empl0h`k'
}
egen f_empl0havg=rowmean(`f_empl0hvars')
egen m_empl0havg=rowmean(`m_empl0hvars')
* The per-slot indicators are 1 only for hours strictly above the threshold, so the
* labels say ">" (a ">=0 hours" label would describe every reported-hours row)
label var f_empl0havg "Father: avg empl status over observed incomes with reported hours (>`MinHours0' hours)"
label var m_empl0havg "Mother: avg empl status over observed incomes with reported hours (>`MinHours0' hours)"









********************* CHILD INCOME VARIABLES **************************

*Annual Child income   
* LAB   : child's real labor income in years when the child is aged 25-48 (year-cohort),
*         taken from the head series when the child is head and from the wife series
*         when the child is wife or "wife"
* LABc  : LAB restricted to values at or above the lower bound held in the local "lowest"
* LOGLABc: log of LABc
* The former tempLABAGE / tempLABAGEc helpers are not created: nothing read them before
* the later "drop temp*", and the child's age is available from AGE (year-cohort).
gen LAB=.
gen LABc=.
gen LOGLABc=.

replace LAB=rinchd   			if (year-cohort>=25 & year-cohort<=48) & inlist(rel,1,10) & (rinchd<.)			// get labor income if child is head
replace LAB=rincwf  			if (year-cohort>=25 & year-cohort<=48) & inlist(rel,2,20,22) & (rincwf<.)		// get labor income if child is wife or "wife"
replace LABc=LAB 				if LAB>=`lowest' & LAB!=. // "c" income measure for IGEs (excluding "low" labor incomes)
replace LOGLABc=log(LABc) 

label var LAB "Child earnings"
label var LABc "Child earnings (excl low)"
label var LOGLABc "Child log earnings (excl low)"
			  
*Number of observed incomes 
* For each child (newid), count the rows with a nonmissing value of each income measure
foreach v in LAB LABc {
	by newid, sort: egen `v'_N = count(`v')
}

label variable LAB_N "# child earnings"
label variable LABc_N "# child earnings (excl low)"

*Child age variable
* Defined only when the survey year is after the birth cohort and the cohort is known
gen AGE = year-cohort if year>cohort & cohort<.
label variable AGE "Child age (year-cohort)"


* Remove all scratch variables created with the temp prefix
drop temp* 		


*******************************************************************************
*******************************************************************************

* SAVE FULL DATASET
* Holds every observation, for sensitivity checks and parent-level figures
quietly compress
sort newid year
save ${projdata}/analysis-sample.dta, replace


* IMPOSE SAMPLE RESTRICTIONS, COMPUTE RANKS, AND SAVE MAIN ANALYSIS SAMPLE
use ${projdata}/analysis-sample.dta, clear
keep if src==1 						// SRC sample only
keep if inrange(AGE,25,48) 			// child observed at ages 25-48
keep if inrange(cohort,1952,1993) 	// birth cohorts 1952-1993
keep if m_newid!=. | f_newid!=. 	// matched to at least one parent
keep if m_LAB!=. | f_LAB!=. 		// observe at least one parent's income
keep if LAB!=. 						// child income observed in this year
* Already implied by the age and cohort filters (AGE=year-cohort>=25 with cohort>=1952); kept as an explicit safeguard
keep if year>=1977

* Percentile ranks on a 0-100 scale: (rank-1)/(N-1)*100, where N is the number of
* nonmissing values in the cell. Ranks are computed over all rows in the cell.
* Scratch columns are tempvars so they cannot collide with dataset variables.
local rankvars "LAB f_LAB m_LAB p_MFAVG p_MFMAX"
tempvar cell_n cell_rank

*Ranks by child birth cohort and child gender (main analyses)
foreach v of local rankvars {
	bysort cohort female: egen `cell_n' = count(`v')
	bysort cohort female: egen `cell_rank' = rank(`v')
	gen p`v' = (`cell_rank'-1)/(`cell_n'-1)*100
	drop `cell_n' `cell_rank'
}

*Ranks by child birth cohort, pooling across child gender 
foreach v of local rankvars {
	bysort cohort: egen `cell_n' = count(`v')
	bysort cohort: egen `cell_rank' = rank(`v')
	gen p`v'_p = (`cell_rank'-1)/(`cell_n'-1)*100
	drop `cell_n' `cell_rank'
}

label variable pLAB "Child earnings rank, by cohort and gender"
label variable pm_LAB "Mother's earnings rank, by child cohort and gender"
label variable pf_LAB "Father's earnings rank, by child cohort and gender"
label variable pp_MFAVG "Parent avg earnings rank, by child cohort and gender"
label variable pp_MFMAX "Parent max earnings rank, by child cohort and gender"

label variable pLAB_p "Child earnings rank, by cohort pooled across gender"
label variable pm_LAB_p "Mother's earnings rank, by child cohort pooled across child gender"
label variable pf_LAB_p "Father's earnings rank, by child cohort pooled across child gender"
label variable pp_MFAVG_p "Parent avg earnings rank, by child cohort pooled across gender"
label variable pp_MFMAX_p "Parent max earnings rank, by child cohort pooled across gender"


* Save the main regression sample (restricted rows plus the rank variables),
* sorted by child id and year
quietly compress
sort newid year
save ${projdata}/analysis-sample-main.dta, replace


* Clear the data in memory and close the log opened at the top of this file
clear
log close



*End 4b_analysis-sample.do
